In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

How to evaluate unsupervised models?

Make them supervised!


In [2]:
from sklearn.datasets import fetch_olivetti_faces

# Fetch the Olivetti faces; shuffling with a fixed seed keeps the sample
# order identical from one run to the next.
dataset = fetch_olivetti_faces(random_state=0, shuffle=True)

# One flattened image per row; the bare expression below displays the shape.
faces = dataset.data
faces.shape


Out[2]:
(400L, 4096L)

In [3]:
def plot_images(data, n_rows=3, n_cols=4, image_shape=(64, 64)):
    """Draw the leading rows of ``data`` as grayscale images on a grid.

    Parameters
    ----------
    data : iterable of arrays
        Each item is reshaped to ``image_shape`` before being shown, so it
        must contain ``image_shape[0] * image_shape[1]`` values.
    n_rows, n_cols : int, optional
        Layout of the subplot grid. At most ``n_rows * n_cols`` images are
        drawn. The defaults reproduce the original fixed 3 x 4 grid.
    image_shape : tuple of int, optional
        (height, width) each item is reshaped to. Defaults to (64, 64).

    Notes
    -----
    When ``data`` holds fewer items than the grid has cells, the remaining
    axes are hidden instead of raising ``IndexError``.
    """
    # squeeze=False keeps ``axes`` a 2-D array even for a 1 x 1 grid, so
    # ``ravel`` below always works.
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False,
                             subplot_kw={'xticks': (), 'yticks': ()})
    flat_axes = axes.ravel()

    n_drawn = 0
    # zip stops at whichever runs out first: grid cells or images.
    for ax, image in zip(flat_axes, data):
        ax.imshow(image.reshape(image_shape), cmap="gray")
        n_drawn += 1

    # Hide any grid cells that did not receive an image.
    for ax in flat_axes[n_drawn:]:
        ax.set_visible(False)

In [4]:
# Look at the first few raw face images before fitting any model.
plot_images(faces)



In [5]:
from sklearn.decomposition import PCA


def RandomizedPCA(n_components=None, random_state=0, **kwargs):
    """Build a ``PCA`` estimator that uses the randomized SVD solver.

    ``sklearn.decomposition.RandomizedPCA`` was deprecated in scikit-learn
    0.18 and removed in 0.20; ``PCA(svd_solver="randomized")`` is its
    replacement. This thin factory keeps the old name callable so that any
    code still written as ``RandomizedPCA(...)`` continues to run.

    Parameters
    ----------
    n_components : int or None
        Number of components to keep (forwarded to ``PCA``).
    random_state : int or None
        Seed for the randomized solver; fixed by default so results are
        reproducible across runs.
    **kwargs
        Any other ``PCA`` keyword arguments (e.g. ``whiten``).
    """
    return PCA(n_components=n_components, svd_solver="randomized",
               random_state=random_state, **kwargs)


pca = RandomizedPCA(n_components=12).fit(faces)
print(pca.components_.shape)
# Each principal component has as many entries as an image has pixels, so
# the components can be displayed like the faces themselves.
plot_images(pca.components_)


(12L, 4096L)

In [6]:
from sklearn.decomposition import FactorAnalysis

# Build the 12-factor model first, then fit it on the face images
# (``fit`` works in place, so ``fa`` is the fitted estimator).
fa = FactorAnalysis(n_components=12)
fa.fit(faces)

# Report the shape of the learned components and show them as images.
print(fa.components_.shape)
plot_images(fa.components_)


(12L, 4096L)

In [7]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# "Make them supervised": each unsupervised decomposition becomes the feature
# extraction step in front of the same classifier, so the two can be compared
# by downstream classification accuracy.
fa_pipe = make_pipeline(FactorAnalysis(n_components=12), LogisticRegression())

# RandomizedPCA was removed in scikit-learn 0.20; PCA with the randomized SVD
# solver is its replacement. The seed makes the comparison reproducible.
pca_pipe = make_pipeline(
    PCA(n_components=12, svd_solver="randomized", random_state=0),
    LogisticRegression(),
)

In [8]:
# ``sklearn.cross_validation`` was removed in scikit-learn 0.20; the same
# helper now lives in ``sklearn.model_selection``.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of each pipeline against the identity labels.
# The decomposition is re-fitted inside every fold, so no information from
# the held-out fold leaks into the features.
fa_scores = cross_val_score(fa_pipe, faces, dataset.target, cv=5)
pca_scores = cross_val_score(pca_pipe, faces, dataset.target, cv=5)
print("Factor analysis scores: %f   PCA scores: %f" % (np.mean(fa_scores), np.mean(pca_scores)))


Factor analysis scores: 0.862500   PCA scores: 0.835000

Use the likelihood (or density) of held-out validation data for probabilistic models

Use structural properties of the result when evaluating clustering

Visualize, inspect, validate

To cross-validate or not to cross-validate


In [ ]: